{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 忽略警告信息\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据集的准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "train=pd.read_csv('datas/house_data.csv')\n",
    "y=train['SalePrice']\n",
    "train1=train.drop(['Id','SalePrice'],axis=1)\n",
    "X=pd.get_dummies(train1).reset_index(drop=True)\n",
    "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tmp=train.isnull().sum()\n",
    "# tmp[tmp>0] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 模型测评"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "def benchmark(model,testset,label):\n",
    "    pred=model.predict(testset)\n",
    "    if pred[pred<0].shape[0]>0:\n",
    "        print('Neg Value')\n",
    "    rmse=np.sqrt(mean_squared_error(label,pred))\n",
    "    lrmse=np.sqrt(mean_squared_error(np.log(label),np.log(pred)))\n",
    "\n",
    "    print('RMSE:',rmse)\n",
    "    print('LRMSE:',lrmse)\n",
    "    return lrmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基础模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ElasticNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import RidgeCV\n",
    "from sklearn.linear_model import LassoCV\n",
    "from sklearn.linear_model import ElasticNetCV\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfolds=KFold(n_splits=10, shuffle=True, random_state=123)\n",
    "e_l1ratio=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.85,0.9,0.95]\n",
    "e_alphas=np.logspace(-10,2.8,150)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def elastic_train_test(alpha,l1ratio):\n",
    "    e_model=make_pipeline(RobustScaler(),ElasticNetCV(alphas=[alpha],l1_ratio=[l1ratio]))\n",
    "    e_model.fit(X_train,y_train)\n",
    "    lrmse=benchmark(e_model,X_test,y_test)\n",
    "    return lrmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 64803.88956616406\n",
      "LRMSE: 0.3056812482960621\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3056812482960621"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elastic_train_test(50,0.5) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "elastic_model=make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio)).fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 25991.07955736571\n",
      "LRMSE: 0.12567210233778722\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.12567210233778722"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "benchmark(elastic_model,X_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.3432183268134919"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elastic_model.steps[1][1].alpha_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "elastic_model.steps[1][1].l1_ratio_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XGBoost训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "xg_reg=xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.7,learning_rate=0.01,max_depth=3,n_estimators=3400,subsample=0.7,nthread=6,seed=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "             colsample_bytree=0.7, gamma=0, importance_type='gain',\n",
       "             learning_rate=0.01, max_delta_step=0, max_depth=3,\n",
       "             min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,\n",
       "             nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,\n",
       "             reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,\n",
       "             subsample=0.7)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xg_reg.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE: 22926.489730019464\n",
      "LRMSE: 0.10024704840338212\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.10024704840338212"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "benchmark(xg_reg,X_test,y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stacking集成算法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 底层算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mlxtend.regressor import StackingCVRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "alphas_alt=np.logspace(-10,2.8,150)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "ridge=make_pipeline(RobustScaler(),RidgeCV(alphas=alphas_alt,cv=kfolds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "lasso=make_pipeline(RobustScaler(),LassoCV(alphas=alphas_alt,cv=kfolds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "elasticnet=make_pipeline(RobustScaler(),ElasticNetCV(alphas=e_alphas,cv=kfolds, l1_ratio=e_l1ratio))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgboost=make_pipeline(RobustScaler(),xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.7,learning_rate=0.01,max_depth=3,n_estimators=3460,subsample=0.7,reg_alpha=0.00006,gamma=0,nthread=6,scale_pos_weight=1,seed=27))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 上层算法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 是否使用原训练集中的feature\n",
    "stack_alg=StackingCVRegressor(regressors=(ridge, lasso, elasticnet, xgboost),\n",
    "                              meta_regressor=xgboost, use_features_in_secondary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "stackX=np.array(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "stacky=np.array(y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stack_alg.fit(stackX, stacky)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "benchmark(stack_alg, X_test, y_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4rc1"
  },
  "toc-autonumbering": false
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
